MidTerm Telecom Customer Churn Analysis Emma Horton – Data Visualization Midterm Project
Libraries Used:
library(ggplot2)
library(dplyr)
library(plotly)
library(lsr)
library(broom)
library(htmlwidgets)
Data Loading
telco <- read.csv("Telco-Customer-Churn.csv", na.strings = c(""))
#str(telco)
#summary(telco)
#head(telco, 5)
Data Cleaning
# SeniorCitizen to factor
telco$SeniorCitizen <- factor(telco$SeniorCitizen, levels=c(0,1), labels=c("No", "Yes"))
# Character columns to factors
cat_cols <- c("gender","Partner","Dependents","PhoneService",
"MultipleLines","InternetService","OnlineSecurity","OnlineBackup",
"DeviceProtection","TechSupport","StreamingTV","StreamingMovies",
"Contract","PaperlessBilling","PaymentMethod","Churn")
telco[cat_cols] <- lapply(telco[cat_cols], factor)
# Fix TotalCharges: convert to numeric
telco$TotalCharges <- as.numeric(as.character(telco$TotalCharges))
# Replace "No internet service" and "No phone service" with "No", replace "No phone service" with "No"
no_internet_cols <- c("OnlineSecurity","OnlineBackup","DeviceProtection",
"TechSupport","StreamingTV","StreamingMovies")
for(col in no_internet_cols) {
telco[[col]] <- factor(ifelse(telco[[col]]=="No internet service", "No",
as.character(telco[[col]])))
}
telco$MultipleLines <- factor(ifelse(telco$MultipleLines=="No phone service", "No",
as.character(telco$MultipleLines)))
# Drop customerID
telco$customerID <- NULL
# Verify results
#sum(is.na(telco$TotalCharges))
#levels(telco$MultipleLines)
#levels(telco$OnlineSecurity)
Set Color Pallet
project_colors <- c(
"Yes" = "#E74C3C",
"No" = "#3498DB",
"Female" = "#9B59B6",
"Male" = "#1ABC9C",
"Month-to-month" = "#F39C12",
"One year" = "#2ECC71",
"Two year" = "#34495E",
"DSL" = "#16A085",
"Fiber optic" = "#D35400",
"No" = "#95A5A6"
)
Exploratory Analysis Distribution of a single categorical variable 1. Categorical Variable Distribution – Contract Type vs Churn (Bar Chart) A key categorical variable is the Contract type. I suspect contract length has an effect on churn. I will plot the count of customers by contract type, and distinguish churn vs not churn with color.
contract_churn_data <- telco %>%
group_by(Contract, Churn) %>%
summarise(count = n(), .groups = 'drop')
plot_ly(contract_churn_data,
x = ~Contract,
y = ~count,
color = ~Churn,
colors = project_colors,
type = "bar") %>%
layout(
barmode = "group",
title = list(text = "Customer Count by Contract Type and Churn Status", x = 0.5),
xaxis = list(title = "Contract Type"),
yaxis = list(title = "Number of Customers"),
legend = list(title = list(text = "Churn"))
)
## Warning: Duplicate levels detected
## Duplicate levels detected
This is a strong indicator that contract length is related to churn. I will perform a deeper dive into the relationship
# Cross-tab Churn by Contract
table(telco$Churn, telco$Contract)
##
## Month-to-month One year Two year
## No 2220 1307 1647
## Yes 1655 166 48
# Chi-square
chisq_test <- chisq.test(table(telco$Churn, telco$Contract))
chisq_test
##
## Pearson's Chi-squared test
##
## data: table(telco$Churn, telco$Contract)
## X-squared = 1184.6, df = 2, p-value < 2.2e-16
# mosaic
mosaicplot(table(telco$Contract, telco$Churn),
color = c(project_colors["No"], project_colors["Yes"]),
main = "Churn vs Contract Type",
xlab = "Contract Type", ylab = "Churn")
Distribution of a single quantitative variable 2. Numerical Variable Distribution – Tenure (Histogram) I want to explore the distribution of tenure (how long customers have been with the company, in months). A histogram will show the frequency of customers by their tenure.
ggplot(telco, aes(x = tenure)) +
geom_histogram(binwidth = 5, fill=project_colors["No"], color="black") +
labs(title="Distribution of Customer Tenure",
x="Tenure (months)", y="Number of Customers") +
theme_minimal()
I want to explore the representation of churn status in regards to
tenure, faceting on contract type.
ggplot(telco, aes(x = tenure, fill = Churn)) +
geom_histogram(binwidth = 5, color = "black", position = "stack") +
facet_wrap(~ Contract) +
scale_fill_manual(values = project_colors) +
labs(
title = "Distribution of Customer Tenure by Contract Type and Churn Status",
x = "Tenure (months)",
y = "Number of Customers",
fill = "Churn"
) +
theme_minimal()
Distribution of two categorical variables 3. Two Categorical Variable Distribution – Payment Method vs. Churn (Grouped Bar) I want to explore the relationship between payment method (bank transfer, credit card, electronic / mailed check) and churn. A grouped bar will display the relationship between payment method and churn.
payment_churn_data <- telco %>%
group_by(PaymentMethod, Churn) %>%
summarise(count = n(), .groups = "drop")
# Remove "(automatic)" from PaymentMethod labels
telco$PaymentMethod <- gsub("\\s*\\(automatic\\)", "", telco$PaymentMethod)
plot_ly(payment_churn_data,
x = ~PaymentMethod,
y = ~count,
color = ~Churn,
colors = project_colors,
type = "bar") %>%
layout(
barmode = "group",
title = list(text = "Customer Churn by Payment Method", x = 0.5),
xaxis = list(title = "Payment Method", tickangle = 30),
yaxis = list(title = "Number of Customers"),
legend = list(title = list(text = "Churn"))
)
## Warning: Duplicate levels detected
## Duplicate levels detected
I want to explore how contract type will impact the distribution
ggplot(telco, aes(x = PaymentMethod, fill = Churn)) +
geom_bar(position = "dodge") +
facet_wrap(~ Contract) +
scale_fill_manual(values = project_colors) +
labs(
title = "Churn Distribution by Payment Method and Contract Type",
x = "Payment Method",
y = "Customer Count",
fill = "Churn"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
4. Quantitative Variable Across Categories – Monthly Charges by Contract
Type (Boxplot) I want explore how monthly charges differ by contract
type.
ggplot(telco, aes(x = Contract, y = MonthlyCharges, fill = Contract)) +
geom_boxplot() +
scale_fill_manual(values = project_colors) +
labs(
title = "Monthly Charges by Contract Type",
x = "Contract Type",
y = "Monthly Charges (USD)"
) +
theme_minimal()
Instead of mucking around to figure out the reason why I suspect a quick
regression will give me more answers in less time.
Quick regression
reg_data <- telco %>%
select(-TotalCharges) %>%
na.omit()
reg_data <- reg_data %>%
mutate(across(where(is.character), as.factor))
model <- lm(MonthlyCharges ~ ., data = reg_data)
summary(model)
##
## Call:
## lm(formula = MonthlyCharges ~ ., data = reg_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2285 -0.6140 -0.0057 0.6070 4.8419
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.498e+01 6.001e-02 416.193 <2e-16 ***
## genderMale 2.336e-02 2.448e-02 0.954 0.340
## SeniorCitizenYes 1.511e-02 3.565e-02 0.424 0.672
## PartnerYes -3.945e-02 2.958e-02 -1.334 0.182
## DependentsYes 1.287e-02 3.140e-02 0.410 0.682
## tenure 5.209e-06 8.443e-04 0.006 0.995
## PhoneServiceYes 2.005e+01 4.816e-02 416.349 <2e-16 ***
## MultipleLinesYes 5.018e+00 2.955e-02 169.829 <2e-16 ***
## InternetServiceFiber optic 2.496e+01 3.518e-02 709.555 <2e-16 ***
## InternetServiceNo -2.505e+01 4.893e-02 -511.848 <2e-16 ***
## OnlineSecurityYes 5.014e+00 3.224e-02 155.490 <2e-16 ***
## OnlineBackupYes 4.992e+00 3.025e-02 165.060 <2e-16 ***
## DeviceProtectionYes 5.022e+00 3.133e-02 160.289 <2e-16 ***
## TechSupportYes 5.030e+00 3.287e-02 153.037 <2e-16 ***
## StreamingTVYes 9.974e+00 3.207e-02 311.040 <2e-16 ***
## StreamingMoviesYes 9.967e+00 3.209e-02 310.579 <2e-16 ***
## ContractOne year 7.762e-03 3.844e-02 0.202 0.840
## ContractTwo year -2.599e-02 4.629e-02 -0.562 0.574
## PaperlessBillingYes -2.039e-02 2.739e-02 -0.744 0.457
## PaymentMethodCredit card 9.687e-04 3.711e-02 0.026 0.979
## PaymentMethodElectronic check -1.768e-02 3.644e-02 -0.485 0.628
## PaymentMethodMailed check -1.403e-02 3.949e-02 -0.355 0.722
## ChurnYes -2.184e-02 3.262e-02 -0.670 0.503
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.026 on 7020 degrees of freedom
## Multiple R-squared: 0.9988, Adjusted R-squared: 0.9988
## F-statistic: 2.75e+05 on 22 and 7020 DF, p-value: < 2.2e-16
To display the regression, I will display a coefficient Plot Bar Chart of Estimates
coeffs <- broom::tidy(model) %>%
filter(term != "(Intercept)") %>%
arrange(desc(estimate))
ggplot(coeffs, aes(x = reorder(term, estimate), y = estimate)) +
geom_col(fill = project_colors["Yes"]) +
coord_flip() +
labs(
title = "Regression Coefficients: Impact on Monthly Charges",
x = "Predictor",
y = "Estimated Impact ($)"
) +
theme_minimal()
ggplot(telco, aes(x = InternetService, y = MonthlyCharges, fill = InternetService)) +
geom_boxplot() +
scale_fill_manual(values = c(
"DSL" = "#16A085",
"Fiber optic" = "#D35400",
"No" = "#95A5A6"
)) +
labs(
title = "Monthly Charges by Internet Service Type",
x = "Internet Service",
y = "Monthly Charges (USD)"
) +
theme_minimal()
Monthly Charges by Streaming TV Usage
ggplot(telco, aes(x = StreamingTV, y = MonthlyCharges, fill = StreamingTV)) +
geom_boxplot() +
scale_fill_manual(values = c("Yes" = "#E74C3C", "No" = "#3498DB")) +
labs(
title = "Monthly Charges by Streaming TV Subscription",
x = "Streaming TV",
y = "Monthly Charges (USD)"
) +
theme_minimal()
Monthly Charges by Tech Support
ggplot(telco, aes(x = TechSupport, y = MonthlyCharges, fill = TechSupport)) +
geom_boxplot() +
scale_fill_manual(values = c("Yes" = "#E74C3C", "No" = "#3498DB")) +
labs(
title = "Monthly Charges by Tech Support Access",
x = "Tech Support",
y = "Monthly Charges (USD)"
) +
theme_minimal()
Monthly Charges by Paperless Billing
ggplot(telco, aes(x = PaperlessBilling, y = MonthlyCharges, fill = PaperlessBilling)) +
geom_boxplot() +
scale_fill_manual(values = c("Yes" = "#E74C3C", "No" = "#3498DB")) +
labs(
title = "Monthly Charges by Paperless Billing Status",
x = "Paperless Billing",
y = "Monthly Charges (USD)"
) +
theme_minimal()
5. Relationship between two quantitative variables Relationship Between
Two Variables – Tenure vs Monthly Charges (Scatter Plot) Now I explore
how a numeric variable relates to churn. Plotting MonthlyCharges against
tenure, and using color to show churn status, can reveal if churners
cluster in some region.
telco$Churn <- factor(trimws(telco$Churn), levels = c("No", "Yes"))
ggplot(telco, aes(x = tenure, y = MonthlyCharges, color = Churn)) +
geom_point(alpha = 0.6) +
labs(title = "Monthly Charges vs Tenure, by Churn Status",
x = "Tenure (months)", y = "Monthly Charges (USD)") +
scale_color_manual(values = project_colors[c("No", "Yes")]) +
theme_minimal()
Create a chart not directly discussed in class (e.g. Heatmap, Radar, Contour, Sunburst) 6. Heatmap – Churn Rate by Contract and Internet Service Finally, I wanted to create a heatmap to visualize churn rates across two categorical dimensions: Contract type and InternetService type. This will highlight combinations (e.g. Fiber optic + month-to-month) that have especially high churn.
churn_rate <- telco %>%
group_by(Contract, InternetService) %>%
summarize(churn_pct = mean(Churn == "Yes") * 100)
## `summarise()` has grouped output by 'Contract'. You can override using the
## `.groups` argument.
ggplot(churn_rate, aes(x = Contract, y = InternetService, fill = churn_pct)) +
geom_tile(color="white") +
scale_fill_gradient(low="lightyellow", high="red", name="Churn Rate (%)") +
labs(title="Churn Rate Heatmap: Contract vs Internet Service",
x="Contract Type", y="Internet Service") +
theme_minimal()
7. Bubble Plot – Customer Charges Over Time by Contract Type I wanted to
highlight how customers on month-to-month contracts often accumulate
high charges earlier, while those on longer contracts tend to accrue
charges more gradually.
Key Churn point
churn_by_tenure <- telco %>%
group_by(tenure) %>%
summarize(
churn_rate = mean(Churn == "Yes")
)
ggplot(churn_by_tenure, aes(x = tenure, y = churn_rate)) +
geom_line(color = "steelblue") +
geom_point() +
labs(
title = "Churn Rate by Tenure",
x = "Tenure (months)",
y = "Churn Rate"
)
churn_by_tenure <- telco %>%
group_by(tenure) %>%
summarize(
churn_rate = mean(Churn == "Yes")
)
inflection <- ggplot(churn_by_tenure, aes(x = tenure, y = churn_rate)) +
geom_smooth(method = "loess", span = 0.2, se = FALSE, color = "darkred") +
labs(
title = "Smoothed Churn Rate by Tenure",
x = "Tenure (months)",
y = "Churn Rate"
)
plot <- ggplotly(inflection)
## `geom_smooth()` using formula = 'y ~ x'
inflection
## `geom_smooth()` using formula = 'y ~ x'
saveWidget(plot, "inflection.html", selfcontained = TRUE)